library(tidyverse)
library(rio)
library(plotDK)
library(lfe)
library(plotrix)

# set replication folder as working directory
setwd("~replication")

load("data_genderedcost_long.rdata")
load("data_genderedcost_wide.rdata")
load("data_genderedcost_background.rdata")

# SurveyStatus values meaning:
# 1 is partial answers/timed-out
# 2 is complete answers
# 4 is refuse to answer

# Create a missing variable defined as 0 if complete and 1 otherwise (hence grouping any type of missingness)
# (missingness also covers answers received too late).

df_long <- df_long %>% 
  mutate(missing = case_when(SurveyStatus==2~0,
                             SurveyStatus==1~1,
                             SurveyStatus==4~1,
                             SurveyEndTime<="2021-12-20 21:49:52"~1)) %>% 
  mutate(completed = ifelse(missing==1,0,1))

## select id's for those who answered too late (use for generating missing variable)
too_laters <- df_background %>% 
  filter(SurveyEndTime>"2021-12-20 21:49:52") %>% 
  select(id)

too_laters <- too_laters$id

df_background <- df_background %>%
  mutate(missing = case_when(SurveyStatus==2~0,
                             SurveyStatus==1~1,
                             SurveyStatus==4~1)) %>% 
  mutate(missing = ifelse(id %in% too_laters,1,missing)) %>% 
  mutate(completed = ifelse(missing==1,0,1))

# 1938 complete answers, 453 missing due to incomplete response
df_background %>% 
  count(missing, completed)

### Dropout based on pre-sampling covariates
# gender
t.test(completed~c_Koen, data = df_background)


### FIGURE F1
### Dropout over survey sequence

df_wide <- df_wide %>% 
  mutate(missing = case_when(SurveyStatus==2~0,
                             SurveyStatus==1~1,
                             SurveyStatus==4~1,
                             SurveyEndTime<="2021-12-20 21:49:52"~1))


# create dataframe that express how many candidates remain in the sample at each stage in the questionnaire
df_wide_dropouts <- df_wide %>% 
  # keep relevant variables
  select(c(civilstatus, alder_o1, uddannelse, kom_valg, politik_skala,
           conjoint_1_choice, conjoint_2_choice, conjoint_3_choice, conjoint_4_choice,
           conjoint_5_choice, conjoint_6_choice, conjoint_7_choice, 
           arbejdstid, kraenkelser_1_resp, kraenkelser_2_resp, kraenkelser_3_resp,
           kraenkelser_4_resp, kraenkelser_5_resp, risiko))

## group by task number and calculate share of NA 
df_agg_dropouts <- df_wide_dropouts %>% 
  summarise(na_civil = 1-mean(is.na(civilstatus)),
            na_alder = 1-mean(is.na(alder_o1)),
            na_uddannelse = 1-mean(is.na(uddannelse)),
            na_kom_valg = 1-mean(is.na(kom_valg)),
            na_politik_skala = 1-mean(is.na(politik_skala)),
            na_choice_1 = 1-mean(is.na(conjoint_1_choice)),
            na_choice_2 = 1-mean(is.na(conjoint_2_choice)),
            na_choice_3 = 1-mean(is.na(conjoint_3_choice)),
            na_choice_4 = 1-mean(is.na(conjoint_4_choice)),
            na_choice_5 = 1-mean(is.na(conjoint_5_choice)),
            na_choice_6 = 1-mean(is.na(conjoint_6_choice)),
            na_choice_7 = 1-mean(is.na(conjoint_7_choice)),
            na_arbejdstid = 1-mean(is.na(arbejdstid)),
            na_kraenkelser_1 = 1-mean(is.na(kraenkelser_1_resp)),
            na_kraenkelser_2 = 1-mean(is.na(kraenkelser_2_resp)),
            na_kraenkelser_3 = 1-mean(is.na(kraenkelser_3_resp)),
            na_kraenkelser_4 = 1-mean(is.na(kraenkelser_4_resp)),
            na_kraenkelser_5 = 1-mean(is.na(kraenkelser_5_resp)),
            na_risiko = 1-mean(is.na(risiko)))

# make from wide to long
df_agg_dropouts <- df_agg_dropouts %>% 
  pivot_longer(cols = everything(),
               names_to = c("variable"),
               #names_sep = "_",
               values_to = "survivors")

# questionnaire sequence: define variable for task/question number in survey
df_agg_dropouts$number <- seq(1:19)

## define which steps in the questionnaire that contained the conjoint experiment 
df_agg_dropouts <- df_agg_dropouts %>% 
  mutate(experiments = case_when(number %in% 6:12~"experiment", TRUE~"not experiment"))


df_agg_dropouts %>% 
  ggplot(data=., aes(x=number, y = survivors, fill = experiments)) +
  geom_col(color = "black", width = 0.7) +
  theme_bw() +
  xlab("Question number") +
  ylab("Share of initial participants") +
  scale_fill_manual(values = c("grey75", "white")) +
  scale_x_continuous(breaks = seq(1,19,1), labels = seq(1,19,1)) +
  scale_y_continuous(breaks = seq(0,1,0.1), labels = seq(0,1,0.1)) +
  coord_cartesian(ylim = c(0,1)) +
  geom_vline(xintercept = 5.5, linetype = "dashed") +
  geom_vline(xintercept = 12.5, linetype = "dashed") +
  geom_text(aes(label = round(survivors,digits = 2)),
            position = position_dodge(width = 0.9), vjust = -0.5, size = 3) +
  annotate("text", x = 9, y = 1, label = "Conjoint experiments", size = 3) +
  theme(legend.position = "none",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank())


ggsave("figureF1.pdf", height = 4, width = 8)  


### FIGURE F.2
### Dropout with regards to treatments
# interact the 4 attributes (to get all treatment combinations and estimate a LPM for completion probability)
# remember - this dropout is conditional on taking up the questions

test <- felm(completed ~ position*remuneration*workload*work_environment|0|0|id, data = df_long)

stargazer::stargazer(test, type = "text")

## plot raw probabilities
df_aggr_treatments <- df_long %>% 
  group_by(position,remuneration,workload,work_environment) %>%
  # calculate completion rate conditional on making it to the experiments.
  summarise(mean_completed = mean(completed),
            std_error = std.error(completed)) %>% 
  mutate(treatment_bundle = str_c(position, remuneration, workload, work_environment, sep = "/")) %>% 
  filter(!is.na(position)) # remove NA row

# arrange values within working environment groups
df_aggr_treatments <- df_aggr_treatments %>%
  arrange(work_environment,mean_completed) %>% 
  group_by(work_environment) %>% 
  mutate(placement_completed_facet = row_number()) %>% 
  ungroup()

plot_sort_facet <- df_aggr_treatments %>% 
  ggplot(data=., aes(x = placement_completed_facet, y = mean_completed)) +
  geom_point() +
  geom_linerange(aes(ymin=mean_completed-(std_error*1.96),
                     ymax=mean_completed+(std_error*1.96))) +
  scale_y_continuous(breaks = seq(0.5,1,0.01), labels = seq(0.5,1,0.01)) +
  geom_hline(yintercept = mean(df_aggr_treatments$mean_completed), linetype = "dashed", color = "red") +
  xlab("Treatment bundles") +
  ylab("Completion rate") +
  facet_wrap(~work_environment) +
  theme_bw() +
  coord_cartesian(ylim = c(0.8,1)) +
  theme(legend.position = "top",
        panel.background = element_rect(fill = "white"),
        strip.background = element_rect("white"),
        strip.text = element_text(hjust = 0),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        axis.line.x = element_blank(),
        axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

plot_sort_facet

ggsave("figureF2.pdf", height = 4, width = 8.5)


### FIGURE 3
# load data aggregated to municipality level
load("data_genderedcost_attrition_mun.rdata")

## make col plot
df_attrition_mun <- df_attrition_mun %>%
  arrange(completers_of_openers) %>% 
  mutate(placement_completed = row_number())

df_attrition_mun %>% 
  filter(muni!="Christiansø") %>%
  ggplot(data=., aes(x = placement_completed, y = completers_of_openers)) +
  geom_col(fill = "white", color = "black") +
  theme_bw() +
  ylab("Completion rate") +
  xlab("") +
  scale_y_continuous(labels = seq(0,1,0.05), breaks = seq(0,1,0.05)) +
  scale_x_continuous(labels = seq(0,100,10), breaks = seq(0,100,10)) +
  theme(legend.position = "right",
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank())

ggsave("figureF3.pdf", height = 4, width = 8)


### FIGURE 4

plot_map <- 
  plotDK::plotDK(data = df_attrition_mun,
                 id = "muni",
                 value = "completers_of_openers",
                 plotlevel = "municipality",
                 show_borders = FALSE)

plot_map +
  scale_fill_viridis_c(option = "D", "Completion rate",
                       #breaks = seq(0.5,1,0.1), labels = seq(0.5,1,0.1),
                       limits = c(0.5,1)) +
  theme(legend.position = c(0.85,0.8))

ggsave("figureF4.pdf", height = 4, width = 8)


### FIGURE 5
# share based on the entire pool

plot_map <- 
  plotDK::plotDK(data = df_attrition_mun,
                 id = "muni",
                 value = "completers_of_pool",
                 plotlevel = "municipality",
                 show_borders = FALSE)

plot_map +
  scale_fill_viridis_c(option = "D", "Complete answers as \nshare of full pool",
                       limits = c(0,0.4)) +
  theme(legend.position = c(0.85,0.8))

ggsave("figureF5.pdf", height = 4, width = 8)



### FIGURE 6
df_attrition_mun %>% 
  ggplot(data = ., aes(x=completers_of_pool)) +
  geom_histogram(bins = 30, color = "black", fill = "gray75") +
  scale_y_continuous(labels = seq(0,14,2), breaks = seq(0,14,2)) +
  scale_x_continuous(labels = seq(0,0.35,0.025), breaks = seq(0,0.35,0.025)) +
  coord_cartesian(xlim = c(0.1,0.35)) +
  ylab("Count of municipalities") +
  xlab("Share of complete answers out of all candidates in a municipality") +
  theme_bw() +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank())

ggsave("plots/figureF6.pdf", height = 3.8, width = 8)


